/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * SimpleKMeans.java
 * Copyright (C) 2000 Mark Hall
 */


import java.io.*;
import java.util.*;

import weka.clusterers.ClusterEvaluation;
import weka.clusterers.SimpleKMeans;
import weka.core.*;
import weka.filters.Filter;
import weka.gui.beans.Clusterer;



/**
 * Simple k means clustering class.
 * 
 * Valid options are:
 * <p>
 * 
 * -N <number of clusters> <br>
 * Specify the number of clusters to generate.
 * <p>
 * 
 * -S <seed> <br>
 * Specify random number seed.
 * <p>
 * 
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 1.6 $
 * @see Clusterer
 * @see OptionHandler
 */
public class SimpleKMean extends Clusterer 
{

	/**
	 * training instances
	 */
	protected Instances m_instances;

	/**
	 * replace missing values in training instances
	 */


	/**
	 * number of clusters to generate
	 */
	protected int m_NumClusters = 2;

	/**
	 * holds the cluster centroids
	 */
	protected Instances m_ClusterCentroids;

	/**
	 * temporary variable holding cluster assignments while iterating
	 */
	protected int[] m_ClusterAssignments;

	/**
	 * random seed
	 */
	private int m_Seed = 10;

	/**
	 * attribute min values
	 */
	protected double[] m_Min;

	/**
	 * attribute max values
	 */
	protected double[] m_Max;

	/**
	 * Returns a string describing this clusterer
	 * 
	 * @return a description of the evaluator suitable for
	 *         displaying in the explorer/experimenter gui
	 */
	public String globalInfo()
	{
		return "Cluster data using the k means algorithm";
	}

	/**
	 * Generates a clusterer. Has to initialize all fields of the clusterer
	 * that are not being set via options.
	 * 
	 * @param data
	 *            set of instances serving as training data
	 * @exception Exception
	 *                if the clusterer has not been
	 *                generated successfully
	 */
	public void buildClusterer(Instances data, Instances attackCentroids, Instance normalCentroid) throws Exception
	{

		if (data.checkForStringAttributes())
		{
			throw new Exception("Can't handle string attributes!");
		}


		m_instances = data;

		m_Min = new double[m_instances.numAttributes()];
		m_Max = new double[m_instances.numAttributes()];
		for (int i = 0; i < m_instances.numAttributes(); i++)
		{
			m_Min[i] = m_Max[i] = Double.NaN;
		}

		for (int i = 0; i < m_instances.numInstances(); i++)
		{
			updateMinMax(m_instances.instance(i));
		}
		m_NumClusters = attackCentroids.numInstances() +1; // assign number of clusters to #attackclusters + 1 for the normal cluster-
		m_ClusterCentroids = new Instances(m_instances, m_NumClusters); //generate new empty Instances Object with traininginstance headerinformation
		m_ClusterAssignments = new int[m_instances.numInstances()]; 

		Random RandomO = new Random(m_Seed);
		// Old way to assign cluster centroids by taking random data from the trainingset
		/*
		for (int i = 0; i < m_NumClusters; i++)
		{
			int instIndex = Math.abs(RandomO.nextInt())
					% m_instances.numInstances();
			m_ClusterCentroids.add(m_instances.instance(instIndex));
		}
		*/
		
		m_ClusterCentroids.add(normalCentroid);
		for(int i = 1; i < m_NumClusters; i++){
			m_ClusterCentroids.add(attackCentroids.instance(i));
		}
		
		boolean converged = false;
		while (!converged)
		{
			converged = true;
			for (int i = 0; i < m_instances.numInstances(); i++)
			{
				Instance toCluster = m_instances.instance(i);
				int newC = clusterProcessedInstance(toCluster);
				if (newC != m_ClusterAssignments[i])
				{
					converged = false;
				}
				m_ClusterAssignments[i] = newC;
			}

			Instances[] tempI = new Instances[m_NumClusters];
			// update centroids
			m_ClusterCentroids = new Instances(m_instances, m_NumClusters);
			for (int i = 0; i < m_NumClusters; i++)
			{
				tempI[i] = new Instances(m_instances, 0);
			}
			for (int i = 0; i < m_instances.numInstances(); i++)
			{
				tempI[m_ClusterAssignments[i]].add(m_instances.instance(i));
			}
			for (int i = 0; i < m_NumClusters; i++)
			{
				double[] vals = new double[m_instances.numAttributes()];
				for (int j = 0; j < m_instances.numAttributes(); j++)
				{
					vals[j] = tempI[i].meanOrMode(j);
				}
				m_ClusterCentroids.add(new Instance(1.0, vals));
			}
		}
	}

	/**
	 * clusters an instance that has been through the filters
	 * 
	 * @param instance
	 *            the instance to assign a cluster to
	 * @return a cluster number
	 */
	private int clusterProcessedInstance(Instance instance)
	{
		double minDist = Integer.MAX_VALUE;
		int bestCluster = 0;
		for (int i = 0; i < m_NumClusters; i++)
		{
			double dist = distance(instance, m_ClusterCentroids.instance(i));
			if (dist < minDist)
			{
				minDist = dist;
				bestCluster = i;
			}
		}
		return bestCluster;
	}

	/**
	 * Classifies a given instance.
	 * 
	 * @param instance
	 *            the instance to be assigned to a cluster
	 * @return the number of the assigned cluster as an interger
	 *         if the class is enumerated, otherwise the predicted value
	 * @exception Exception
	 *                if instance could not be classified
	 *                successfully
	 */
	public int clusterInstance(Instance instance) throws Exception
	{


		return clusterProcessedInstance(instance);
	}

	/**
	 * Calculates the distance between two instances
	 * 
	 * @param test
	 *            the first instance
	 * @param train
	 *            the second instance
	 * @return the distance between the two given instances, between 0 and 1
	 */
	private double distance(Instance first, Instance second)
	{

		double distance = 0;
		int firstI, secondI;

		for (int p1 = 0, p2 = 0; p1 < first.numValues()
				|| p2 < second.numValues();)
		{
			if (p1 >= first.numValues())
			{
				firstI = m_instances.numAttributes();
			}
			else
			{
				firstI = first.index(p1);
			}
			if (p2 >= second.numValues())
			{
				secondI = m_instances.numAttributes();
			}
			else
			{
				secondI = second.index(p2);
			}
			if (firstI == m_instances.classIndex())
			{
				p1++;
				continue;
			}
			if (secondI == m_instances.classIndex())
			{
				p2++;
				continue;
			}
			double diff;
			if (firstI == secondI)
			{
				diff = difference(firstI, first.valueSparse(p1),
						second.valueSparse(p2));
				p1++;
				p2++;
			}
			else if (firstI > secondI)
			{
				diff = difference(secondI, 0, second.valueSparse(p2));
				p2++;
			}
			else
			{
				diff = difference(firstI, first.valueSparse(p1), 0);
				p1++;
			}
			distance += diff * diff;
		}

		return Math.sqrt(distance / m_instances.numAttributes());
	}

	/**
	 * Computes the difference between two given attribute
	 * values.
	 */
	private double difference(int index, double val1, double val2)
	{

		switch(m_instances.attribute(index).type())
		{
			case Attribute.NOMINAL:

				// If attribute is nominal
				if (Instance.isMissingValue(val1)
						|| Instance.isMissingValue(val2)
						|| ((int) val1 != (int) val2))
				{
					return 1;
				}
				else
				{
					return 0;
				}
			case Attribute.NUMERIC:

				// If attribute is numeric
				if (Instance.isMissingValue(val1)
						|| Instance.isMissingValue(val2))
				{
					if (Instance.isMissingValue(val1)
							&& Instance.isMissingValue(val2))
					{
						return 1;
					}
					else
					{
						double diff;
						if (Instance.isMissingValue(val2))
						{
							diff = norm(val1, index);
						}
						else
						{
							diff = norm(val2, index);
						}
						if (diff < 0.5)
						{
							diff = 1.0 - diff;
						}
						return diff;
					}
				}
				else
				{
					return norm(val1, index) - norm(val2, index);
				}
			default:
				return 0;
		}
	}

	/**
	 * Normalizes a given value of a numeric attribute.
	 * 
	 * @param x
	 *            the value to be normalized
	 * @param i
	 *            the attribute's index
	 */
	private double norm(double x, int i)
	{

		if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i], m_Min[i]))
		{
			return 0;
		}
		else
		{
			return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
		}
	}

	/**
	 * Updates the minimum and maximum values for all the attributes
	 * based on a new instance.
	 * 
	 * @param instance
	 *            the new instance
	 */
	private void updateMinMax(Instance instance)
	{

		for (int j = 0; j < m_instances.numAttributes(); j++)
		{
			if (!instance.isMissing(j))
			{
				if (Double.isNaN(m_Min[j]))
				{
					m_Min[j] = instance.value(j);
					m_Max[j] = instance.value(j);
				}
				else
				{
					if (instance.value(j) < m_Min[j])
					{
						m_Min[j] = instance.value(j);
					}
					else
					{
						if (instance.value(j) > m_Max[j])
						{
							m_Max[j] = instance.value(j);
						}
					}
				}
			}
		}
	}

	/**
	 * Returns the number of clusters.
	 * 
	 * @return the number of clusters generated for a training dataset.
	 * @exception Exception
	 *                if number of clusters could not be returned
	 *                successfully
	 */
	public int numberOfClusters() throws Exception
	{
		return m_NumClusters;
	}

	/**
	 * Returns an enumeration describing the available options.
	 * <p>
	 * 
	 * Valid options are:
	 * <p>
	 * 
	 * -N <number of clusters> <br>
	 * Specify the number of clusters to generate. If omitted, EM will use cross
	 * validation to select the number of clusters automatically.
	 * <p>
	 * 
	 * -S <seed> <br>
	 * Specify random number seed.
	 * <p>
	 * 
	 * @return an enumeration of all the available options
	 * 
	 **/
	public Enumeration listOptions()
	{
		Vector newVector = new Vector(2);

		newVector.addElement(new Option("\tnumber of clusters. (default = 2).",
				"N", 1, "-N <num>"));
		newVector.addElement(new Option("\trandom number seed.\n (default 10)",
				"S", 1, "-S <num>"));

		return newVector.elements();
	}

	/**
	 * Returns the tip text for this property
	 * 
	 * @return tip text for this property suitable for
	 *         displaying in the explorer/experimenter gui
	 */
	public String numClustersTipText()
	{
		return "set number of clusters";
	}

	/**
	 * set the number of clusters to generate
	 * 
	 * @param n
	 *            the number of clusters to generate
	 */
	public void setNumClusters(int n)
	{
		m_NumClusters = n;
	}

	/**
	 * gets the number of clusters to generate
	 * 
	 * @return the number of clusters to generate
	 */
	public int getNumClusters()
	{
		return m_NumClusters;
	}

	/**
	 * Returns the tip text for this property
	 * 
	 * @return tip text for this property suitable for
	 *         displaying in the explorer/experimenter gui
	 */
	public String seedTipText()
	{
		return "random number seed";
	}

	/**
	 * Set the random number seed
	 * 
	 * @param s
	 *            the seed
	 */
	public void setSeed(int s)
	{
		m_Seed = s;
	}

	/**
	 * Get the random number seed
	 * 
	 * @return the seed
	 */
	public int getSeed()
	{
		return m_Seed;
	}

	/**
	 * Parses a given list of options.
	 * 
	 * @param options
	 *            the list of options as an array of strings
	 * @exception Exception
	 *                if an option is not supported
	 * 
	 **/
	public void setOptions(String[] options) throws Exception
	{

		String optionString = Utils.getOption('N', options);

		if (optionString.length() != 0)
		{
			setNumClusters(Integer.parseInt(optionString));
		}

		optionString = Utils.getOption('S', options);

		if (optionString.length() != 0)
		{
			setSeed(Integer.parseInt(optionString));
		}
	}

	/**
	 * Gets the current settings of SimpleKMeans
	 * 
	 * @return an array of strings suitable for passing to setOptions()
	 */
	public String[] getOptions()
	{
		String[] options = new String[4];
		int current = 0;

		options[current++] = "-N";
		options[current++] = "" + getNumClusters();
		options[current++] = "-S";
		options[current++] = "" + getSeed();

		while (current < options.length)
		{
			options[current++] = "";
		}

		return options;
	}

	/**
	 * return a string describing this clusterer
	 * 
	 * @return a description of the clusterer as a string
	 */
	public String toString()
	{
		StringBuffer temp = new StringBuffer();

		temp.append("\nkMeans\n======\n");

		temp.append("\nCluster centroids:\n");
		for (int i = 0; i < m_NumClusters; i++)
		{
			temp.append("\nCluster " + i + "\n\t");
			for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++)
			{
				if (m_ClusterCentroids.attribute(j).isNominal())
				{
					temp.append(" "
							+ m_ClusterCentroids.attribute(j).value(
									(int) m_ClusterCentroids.instance(i).value(
											j)));
				}
				else
				{
					temp.append(" " + m_ClusterCentroids.instance(i).value(j));
				}
			}
		}
		return temp.toString();
	}

	/**
	 * Main method for testing this class.
	 * 
	 * @param argv
	 *            should contain the following arguments:
	 *            <p>
	 *            -t training file [-N number of clusters]
	 */
	public static void main(String[] argv)
	{
		try
		{
			System.out.println(ClusterEvaluation.evaluateClusterer(
					(weka.clusterers.Clusterer) new SimpleKMean(), argv));
		}
		catch(Exception e)
		{
			System.out.println(e.getMessage());
			e.printStackTrace();
		}
	}
}